//  ---------------------------------------------------------------------------
//  This file is part of reSID, a MOS6581 SID emulator engine.
//  Copyright (C) 2004  Dag Lem <resid@nimrod.no>
//
//  This program is free software; you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation; either version 2 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//  ---------------------------------------------------------------------------
#include <stdint.h>
#include "resid-fp/sid.h"

#if (RESID_USE_SSE == 1)

#include <xmmintrin.h>

/*
 * Compute the dot product sum(a[i] * b[i]) over i in [0, n) using SSE.
 *
 * a, b: input arrays; each must hold at least n floats.
 * n:    number of element pairs to multiply and accumulate. A count of
 *       zero or less reads nothing and yields 0.
 *
 * Returns the accumulated sum.
 *
 * The work is split into three phases: a scalar head that advances until
 * `a` sits on a 16-byte boundary, a vector body that processes four floats
 * per iteration, and a scalar tail for the remaining 0..3 elements.
 */
float convolve_sse(const float *a, const float *b, int n) {
        /* A negative count must not reach the `n &= 3` tail computation:
         * masking a negative value would produce a bogus 1..3 element tail. */
        if (n <= 0) {
                return 0.f;
        }

        float out = 0.f;

        /* Scalar head: step until a is 16-byte aligned. If a can never reach
         * such a boundary, this loop simply consumes the whole input. We can't
         * let n fall below 0, hence the explicit n > 0 test. */
        while (n > 0 && ((uintptr_t)a & 0xf) != 0) {
                out += (*(a++)) * (*(b++));
                --n;
        }

        __m128 out4 = _mm_setzero_ps();
        const int n4 = n / 4;

        /* a is aligned whenever n4 > 0; aligned loads on b are possible only
         * if b ended up on a 16-byte boundary as well. Test the address
         * itself (in bytes) rather than the element distance between the two
         * pointers. */
        if (((uintptr_t)b & 0xf) == 0) {
                for (int i = 0; i < n4; i++) {
                        out4 = _mm_add_ps(out4, _mm_mul_ps(_mm_load_ps(a), _mm_load_ps(b)));
                        a += 4;
                        b += 4;
                }
        } else {
                /* XXX loadu is at least 4x slower than load. At 4x the memory
                 * cost we could prepare copies of b aligned for every possible
                 * alignment of a, or issue aligned loads and shuffle the halves
                 * on each iteration. Initial results indicate only very small
                 * improvements. */
                for (int i = 0; i < n4; i++) {
                        out4 = _mm_add_ps(out4, _mm_mul_ps(_mm_load_ps(a), _mm_loadu_ps(b)));
                        a += 4;
                        b += 4;
                }
        }

        /* Horizontal sum: fold the upper pair of lanes onto the lower pair,
         * then add lane 1 to lane 0. */
        out4 = _mm_add_ps(_mm_movehl_ps(out4, out4), out4);
        out4 = _mm_add_ss(_mm_shuffle_ps(out4, out4, 1), out4);
        float out_tmp;
        _mm_store_ss(&out_tmp, out4);
        out += out_tmp;

        /* Scalar tail: the 0..3 elements left over after the vector body. */
        n &= 3;
        while (n > 0) {
                out += (*(a++)) * (*(b++));
                --n;
        }

        return out;
}
#endif
